package net.seninp.jmotif.sax;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.joda.time.Duration;
import org.joda.time.format.PeriodFormatter;
import org.joda.time.format.PeriodFormatterBuilder;
import net.seninp.jmotif.distance.EuclideanDistance;
import net.seninp.jmotif.sax.alphabet.NormalAlphabet;
import net.seninp.jmotif.sax.datastructure.SAXRecord;
import net.seninp.jmotif.sax.datastructure.SAXRecords;
/**
* Implements SAX algorithms.
*
* @author Pavel Senin
*
*/
public final class SAXProcessor {
private final TSProcessor tsProcessor;
private final NormalAlphabet na;
private EuclideanDistance ed;
/**
* Constructor.
*/
public SAXProcessor() {
super();
this.tsProcessor = new TSProcessor();
this.na = new NormalAlphabet();
this.ed = new EuclideanDistance();
}
/**
* Convert the timeseries into SAX string representation.
*
* @param ts the timeseries.
* @param paaSize the PAA size.
* @param cuts the alphabet cuts.
* @param nThreshold the normalization thresholds.
*
* @return The SAX representation for timeseries.
* @throws SAXException if error occurs.
*/
public char[] ts2string(double[] ts, int paaSize, double[] cuts, double nThreshold)
throws SAXException {
if (paaSize == ts.length) {
return tsProcessor.ts2String(tsProcessor.znorm(ts, nThreshold), cuts);
}
else {
// perform PAA conversion
double[] paa = tsProcessor.paa(tsProcessor.znorm(ts, nThreshold), paaSize);
return tsProcessor.ts2String(paa, cuts);
}
}
/**
* Converts the input time series into a SAX data structure via chunking and Z normalization.
*
* @param ts the input data.
* @param paaSize the PAA size.
* @param cuts the Alphabet cuts.
* @param nThreshold the normalization threshold value.
*
* @return SAX representation of the time series.
* @throws SAXException if error occurs.
*/
public SAXRecords ts2saxByChunking(double[] ts, int paaSize, double[] cuts, double nThreshold)
throws SAXException {
SAXRecords saxFrequencyData = new SAXRecords();
// Z normalize it
double[] normalizedTS = tsProcessor.znorm(ts, nThreshold);
// perform PAA conversion if needed
double[] paa = tsProcessor.paa(normalizedTS, paaSize);
// Convert the PAA to a string.
char[] currentString = tsProcessor.ts2String(paa, cuts);
// create the datastructure
for (int i = 0; i < currentString.length; i++) {
char c = currentString[i];
int pos = (int) Math.floor(i * ts.length / currentString.length);
saxFrequencyData.add(String.valueOf(c).toCharArray(), pos);
}
return saxFrequencyData;
}
/**
* Converts the input time series into a SAX data structure via sliding window and Z
* normalization.
*
* @param ts the input data.
* @param windowSize the sliding window size.
* @param paaSize the PAA size.
* @param cuts the Alphabet cuts.
* @param nThreshold the normalization threshold value.
* @param strategy the NR strategy.
*
* @return SAX representation of the time series.
* @throws SAXException if error occurs.
*/
public SAXRecords ts2saxViaWindow(double[] ts, int windowSize, int paaSize, double[] cuts,
NumerosityReductionStrategy strategy, double nThreshold) throws SAXException {
if (windowSize > ts.length) {
throw new SAXException(
"Unable to saxify via window, window size is greater than the timeseries length...");
}
// the resulting data structure init
//
SAXRecords saxFrequencyData = new SAXRecords();
// scan across the time series extract sub sequences, and convert them to strings
char[] previousString = null;
for (int i = 0; i <= ts.length - windowSize; i++) {
// fix the current subsection
double[] subSection = Arrays.copyOfRange(ts, i, i + windowSize);
// Z normalize it
subSection = tsProcessor.znorm(subSection, nThreshold);
// perform PAA conversion if needed
double[] paa = tsProcessor.paa(subSection, paaSize);
// Convert the PAA to a string.
char[] currentString = tsProcessor.ts2String(paa, cuts);
if (null != previousString) {
if (NumerosityReductionStrategy.EXACT.equals(strategy)
&& Arrays.equals(previousString, currentString)) {
// NumerosityReduction
continue;
}
else if (NumerosityReductionStrategy.MINDIST.equals(strategy)
&& checkMinDistIsZero(previousString, currentString)) {
continue;
}
}
previousString = currentString;
saxFrequencyData.add(currentString, i);
}
// ArrayList<Integer> keys = saxFrequencyData.getAllIndices();
// for (int i : keys) {
// System.out.println(i + "," + String.valueOf(saxFrequencyData.getByIndex(i).getPayload()));
// }
return saxFrequencyData;
}
/**
* Converts the input time series into a SAX data structure via sliding window and Z
* normalization. The difference between this function and ts2saxViaWindow is that in this
* function, Z normalization occurs on entire range, rather than the sliding window.
*
* @param ts the input data.
* @param windowSize the sliding window size.
* @param paaSize the PAA size.
* @param cuts the Alphabet cuts.
* @param nThreshold the normalization threshold value.
* @param strategy the NR strategy.
*
* @return SAX representation of the time series.
* @throws SAXException if error occurs.
*/
public SAXRecords ts2saxViaWindowGlobalZNorm(double[] ts, int windowSize, int paaSize,
double[] cuts, NumerosityReductionStrategy strategy, double nThreshold) throws SAXException {
// the resulting data structure init
//
SAXRecords saxFrequencyData = new SAXRecords();
// scan across the time series extract sub sequences, and convert them to strings
char[] previousString = null;
// normalize the entire range
double[] normalizedData = tsProcessor.znorm(ts, nThreshold);
for (int i = 0; i <= ts.length - windowSize; i++) {
// get the current subsection
double[] subSection = Arrays.copyOfRange(normalizedData, i, i + windowSize);
// perform PAA conversion if needed
double[] paa = tsProcessor.paa(subSection, paaSize);
// Convert the PAA to a string.
char[] currentString = tsProcessor.ts2String(paa, cuts);
if (null != previousString) {
if (NumerosityReductionStrategy.EXACT.equals(strategy)
&& Arrays.equals(previousString, currentString)) {
// NumerosityReduction
continue;
}
else if (NumerosityReductionStrategy.MINDIST.equals(strategy)
&& checkMinDistIsZero(previousString, currentString)) {
continue;
}
}
previousString = currentString;
saxFrequencyData.add(currentString, i);
}
return saxFrequencyData;
}
/**
* Converts the input time series into a SAX data structure via sliding window and Z
* normalization.
*
* @param ts the input data.
* @param windowSize the sliding window size.
* @param paaSize the PAA size.
* @param cuts the Alphabet cuts.
* @param nThreshold the normalization threshold value.
* @param strategy the NR strategy.
* @param skips The list of points which shall be skipped during conversion; this feature is
* particularly important when building a concatenated from pieces time series and junction shall
* not make it into the grammar.
*
* @return SAX representation of the time series.
* @throws SAXException if error occurs.
*/
public SAXRecords ts2saxViaWindowSkipping(double[] ts, int windowSize, int paaSize, double[] cuts,
NumerosityReductionStrategy strategy, double nThreshold, ArrayList<Integer> skips)
throws SAXException {
// the resulting data structure init
//
SAXRecords saxFrequencyData = new SAXRecords();
Collections.sort(skips);
int cSkipIdx = 0;
// scan across the time series extract sub sequences, and convert them to strings
char[] previousString = null;
boolean skipped = false;
for (int i = 0; i < ts.length - (windowSize - 1); i++) {
// skip what need to be skipped
if (cSkipIdx < skips.size() && i == skips.get(cSkipIdx)) {
cSkipIdx = cSkipIdx + 1;
skipped = true;
continue;
}
// fix the current subsection
double[] subSection = Arrays.copyOfRange(ts, i, i + windowSize);
// Z normalize it
subSection = tsProcessor.znorm(subSection, nThreshold);
// perform PAA conversion if needed
double[] paa = tsProcessor.paa(subSection, paaSize);
// Convert the PAA to a string.
char[] currentString = tsProcessor.ts2String(paa, cuts);
if (!(skipped) && null != previousString) {
if (NumerosityReductionStrategy.EXACT.equals(strategy)
&& Arrays.equals(previousString, currentString)) {
// NumerosityReduction
continue;
}
else if (NumerosityReductionStrategy.MINDIST.equals(strategy)
&& checkMinDistIsZero(previousString, currentString)) {
continue;
}
}
previousString = currentString;
if (skipped) {
skipped = false;
}
saxFrequencyData.add(currentString, i);
}
return saxFrequencyData;
}
/**
* Compute the distance between the two chars based on the ASCII symbol codes.
*
* @param a The first char.
* @param b The second char.
* @return The distance.
*/
public int charDistance(char a, char b) {
return Math.abs(Character.getNumericValue(a) - Character.getNumericValue(b));
}
/**
* Compute the distance between the two strings, this function use the numbers associated with
* ASCII codes, i.e. distance between a and b would be 1.
*
* @param a The first string.
* @param b The second string.
* @return The pairwise distance.
* @throws SAXException if length are differ.
*/
public int strDistance(char[] a, char[] b) throws SAXException {
if (a.length == b.length) {
int distance = 0;
for (int i = 0; i < a.length; i++) {
int tDist = Math.abs(Character.getNumericValue(a[i]) - Character.getNumericValue(b[i]));
distance += tDist;
}
return distance;
}
else {
throw new SAXException("Unable to compute SAX distance, string lengths are not equal");
}
}
/**
* This function implements SAX MINDIST function which uses alphabet based distance matrix.
*
* @param a The SAX string.
* @param b The SAX string.
* @param distanceMatrix The distance matrix to use.
* @param n the time series length (sliding window length).
* @param w the number of PAA segments.
* @return distance between strings.
* @throws SAXException If error occurs.
*/
public double saxMinDist(char[] a, char[] b, double[][] distanceMatrix, int n, int w)
throws SAXException {
if (a.length == b.length) {
double dist = 0.0D;
for (int i = 0; i < a.length; i++) {
if (Character.isLetter(a[i]) && Character.isLetter(b[i])) {
// ... forms have numeric values from 10 through 35
int numA = Character.getNumericValue(a[i]) - 10;
int numB = Character.getNumericValue(b[i]) - 10;
int maxIdx = distanceMatrix[0].length;
if (numA > (maxIdx - 1) || numA < 0 || numB > (maxIdx - 1) || numB < 0) {
throw new SAXException(
"The character index greater than " + maxIdx + " or less than 0!");
}
double localDist = distanceMatrix[numA][numB];
dist = dist + localDist * localDist;
}
else {
throw new SAXException("Non-literal character found!");
}
}
return Math.sqrt((double) n / (double) w) * Math.sqrt(dist);
}
else {
throw new SAXException("Data arrays lengths are not equal!");
}
}
/**
* Check for trivial mindist case.
*
* @param a first string.
* @param b second string.
* @return true if mindist between strings is zero.
*/
public boolean checkMinDistIsZero(char[] a, char[] b) {
for (int i = 0; i < a.length; i++) {
if (charDistance(a[i], b[i]) > 1) {
return false;
}
}
return true;
}
/**
* Computes the distance between approximated values and the real TS.
*
* @param ts the timeseries.
* @param winSize SAX window size.
* @param paaSize SAX PAA size.
* @param normThreshold the normalization threshold.
* @return the distance value.
* @throws Exception if error occurs.
*/
public double approximationDistancePAA(double[] ts, int winSize, int paaSize,
double normThreshold) throws Exception {
double resDistance = 0d;
int windowCounter = 0;
double pointsPerWindow = (double) winSize / (double) paaSize;
for (int i = 0; i < ts.length - winSize + 1; i++) {
double[] subseries = Arrays.copyOfRange(ts, i, i + winSize);
if (tsProcessor.stDev(subseries) > normThreshold) {
subseries = tsProcessor.znorm(subseries, normThreshold);
}
double[] paa = tsProcessor.paa(subseries, paaSize);
windowCounter++;
// essentially the distance here is the distance between the segment's
// PAA value and the real TS value
//
double subsequenceDistance = 0.;
for (int j = 0; j < subseries.length; j++) {
int paaIdx = (int) Math.floor(((double) j + 0.5) / (double) pointsPerWindow);
if (paaIdx < 0) {
paaIdx = 0;
}
if (paaIdx > paa.length) {
paaIdx = paa.length - 1;
}
subsequenceDistance = subsequenceDistance + ed.distance(paa[paaIdx], subseries[j]);
}
resDistance = resDistance + subsequenceDistance / subseries.length;
}
return resDistance / (double) windowCounter;
}
/**
* Computes the distance between approximated values and the real TS.
*
* @param ts the timeseries.
* @param winSize SAX window size.
* @param paaSize SAX PAA size.
* @param alphabetSize SAX alphabet size.
* @param normThreshold the normalization threshold.
* @return the distance value.
* @throws Exception if error occurs.
*/
public double approximationDistanceAlphabet(double[] ts, int winSize, int paaSize,
int alphabetSize, double normThreshold) throws Exception {
double resDistance = 0d;
int windowCounter = 0;
double[] centralLines = na.getCentralCuts(alphabetSize);
for (int i = 0; i < ts.length - winSize + 1; i++) {
double[] subseries = Arrays.copyOfRange(ts, i, i + winSize);
double subsequenceDistance = 0.;
if (tsProcessor.stDev(subseries) > normThreshold) {
subseries = tsProcessor.znorm(subseries, normThreshold);
}
double[] paa = tsProcessor.paa(subseries, paaSize);
int[] leterIndexes = tsProcessor.ts2Index(paa, na, alphabetSize);
windowCounter++;
// essentially the distance here is the distance between the segment's
// PAA value and the real TS value
//
for (int j = 0; j < paa.length; j++) {
// compute the alphabet central cut line
int letterIdx = leterIndexes[j];
double cLine = centralLines[letterIdx];
subsequenceDistance = subsequenceDistance + ed.distance(cLine, paa[j]);
}
resDistance = resDistance + subsequenceDistance / paa.length;
}
return resDistance / (double) windowCounter;
}
/**
* Converts a single time-series into map of shingle frequencies.
*
* @param series the time series.
* @param windowSize the sliding window size.
* @param paaSize the PAA segments number.
* @param alphabetSize the alphabet size.
* @param strategy the numerosity reduction strategy.
* @param nrThreshold the SAX normalization threshold.
* @param shingleSize the shingle size.
*
* @return map of shingle frequencies.
* @throws SAXException if error occurs.
*/
public Map<String, Integer> ts2Shingles(double[] series, int windowSize, int paaSize,
int alphabetSize, NumerosityReductionStrategy strategy, double nrThreshold, int shingleSize)
throws SAXException {
// build all shingles
String[] alphabet = new String[alphabetSize];
for (int i = 0; i < alphabetSize; i++) {
alphabet[i] = String.valueOf(TSProcessor.ALPHABET[i]);
}
String[] allShingles = getAllPermutations(alphabet, shingleSize);
// result
HashMap<String, Integer> res = new HashMap<String, Integer>(allShingles.length);
for (String s : allShingles) {
res.put(s, 0);
}
// discretize
SAXRecords saxData = ts2saxViaWindow(series, windowSize, paaSize, na.getCuts(alphabetSize),
strategy, nrThreshold);
// fill in the counts
for (SAXRecord sr : saxData) {
String word = String.valueOf(sr.getPayload());
int frequency = sr.getIndexes().size();
for (int i = 0; i <= word.length() - shingleSize; i++) {
String shingle = word.substring(i, i + shingleSize);
res.put(shingle, res.get(shingle) + frequency);
}
}
return res;
}
/**
* Get all permutations of the given alphabet of given length.
*
* @param alphabet the alphabet to use.
* @param wordLength the word length.
* @return set of permutation.
*/
public static String[] getAllPermutations(String[] alphabet, int wordLength) {
// initialize our returned list with the number of elements calculated above
String[] allLists = new String[(int) Math.pow(alphabet.length, wordLength)];
// lists of length 1 are just the original elements
if (wordLength == 1)
return alphabet;
else {
// the recursion--get all lists of length 3, length 2, all the way up to 1
String[] allSublists = getAllPermutations(alphabet, wordLength - 1);
// append the sublists to each element
int arrayIndex = 0;
for (int i = 0; i < alphabet.length; i++) {
for (int j = 0; j < allSublists.length; j++) {
// add the newly appended combination to the list
allLists[arrayIndex] = alphabet[i] + allSublists[j];
arrayIndex++;
}
}
return allLists;
}
}
/**
* Generic method to convert the milliseconds into the elapsed time string.
*
* @param start Start timestamp.
* @param finish End timestamp.
* @return String representation of the elapsed time.
*/
public static String timeToString(long start, long finish) {
Duration duration = new Duration(finish - start); // in milliseconds
PeriodFormatter formatter = new PeriodFormatterBuilder().appendDays().appendSuffix("d")
.appendHours().appendSuffix("h").appendMinutes().appendSuffix("m").appendSeconds()
.appendSuffix("s").appendMillis().appendSuffix("ms").toFormatter();
return formatter.print(duration.toPeriod());
}
}